1. Create ExpressionSet * Clone http://github.com/mdozmorov/63_immune_cells repository on your computer.

a. Create Expression matrix - Read in data/counts.txt.gz

fulldata<-read.table("counts.txt.gz",header=TRUE, sep="\t", as.is=T)

b. Note the first 6 columns are gene annotation columns. Extract them into a separate variable - this is a part of your feature annotation data.

annot<-fulldata[,1:6]

c. Extract the other columns into a separate variable -this is your expression matrix.

expremtx<- fulldata[, -c(1:6)]

d. Add row names (probe IDs) to the expression matrix, extracted from column “Geneid”. What type of probe ID is it?

rownames(expremtx) <- fulldata$Geneid

These are Human being probe IDs by ensembl.

e. Create feature annotation matrix.

-Using the first 6 column extracted above, add gene symbol and description to the corresponding probe IDs. Use biomaRt, merge the returned data. Make sure the number and the order of probe IDs match! How many probe IDs did not have mapping? How many probe IDs have description but no gene symbol?

library(biomaRt)
library(dplyr)

mart <- useMart("ensembl", dataset="hsapiens_gene_ensembl") 
genes<-getBM(attributes=c('ensembl_gene_id','hgnc_symbol','description'), 
filters='ensembl_gene_id', values=fulldata$Geneid, mart=mart)#, uniqueRows=T)


annotII <- left_join(annot, genes, by = c("Geneid" = "ensembl_gene_id"))
annotII <- annotII[!duplicated(annotII$Geneid),]

# Probe ID with no match
sum(is.na(annotII$hgnc_symbol))
## [1] 9289
#With description but no gene symbol
length(which(genes$hgnc_symbol=="" & genes$description !=""))
## [1] 1503

f. Create sample annotation matrix - Read in data/E-MTAb-2319,sdrf.txt -this is your sample annotation data. How many different cell types are there? How many replicates per cell types?

anottIII <- read.table("E-MTAB-2319.sdrf.txt", sep="\t", header = T, as.is = T)

a<-table(anottIII$Characteristics.cell.type.)
length(a)
## [1] 13
a
## 
##               B CD5            B Memory             B Naive 
##                   8                  10                  10 
##  CD4 Central Memory CD4 Effector Memory           CD4 Naive 
##                  10                  10                  10 
##             CD4 Th1            CD4 Th17             CD4 Th2 
##                  10                  10                  10 
##            CD4 Treg  CD8 Central Memory CD8 Effector Memory 
##                  10                   8                  10 
##           CD8 Naive 
##                  10

There are 13 cell types. There are some with 8 and some with 10 replicates per cell type.

table(anottIII$Source.Name)
## 
## SQ_0007 SQ_0011 SQ_0014 SQ_0015 SQ_0021 SQ_0022 SQ_0023 SQ_0046 SQ_0047 
##       2       2       2       2       2       2       2       2       2 
## SQ_0048 SQ_0050 SQ_0051 SQ_0052 SQ_0053 SQ_0055 SQ_0056 SQ_0058 SQ_0059 
##       2       2       2       2       2       2       2       2       2 
## SQ_0065 SQ_0067 SQ_0080 SQ_0081 SQ_0082 SQ_0096 SQ_0098 SQ_0130 SQ_0132 
##       2       2       2       2       2       2       2       2       2 
## SQ_0134 SQ_0136 SQ_0145 SQ_0146 SQ_0147 SQ_0148 SQ_0149 SQ_0150 SQ_0151 
##       2       2       2       2       2       2       2       2       2 
## SQ_0152 SQ_0153 SQ_0154 SQ_0155 SQ_0156 SQ_0157 SQ_0158 SQ_0159 SQ_0160 
##       2       2       2       2       2       2       2       2       2 
## SQ_0161 SQ_0162 SQ_0163 SQ_0164 SQ_0165 SQ_0166 SQ_0167 SQ_0168 SQ_0169 
##       2       2       2       2       2       2       2       2       2 
## SQ_0170 SQ_0171 SQ_0172 SQ_0173 SQ_0174 SQ_0175 SQ_0176 SQ_0177 SQ_0178 
##       2       2       2       2       2       2       2       2       2
pdata <- anottIII[which(!duplicated(anottIII$Comment.ENA_RUN.)),]
dim(pdata)
## [1] 63 36
rownames(pdata) <- pdata$Comment.ENA_RUN.
pdata <- pdata[,-which(colnames(pdata)=="Comment.ENA_RUN.")]

colnames(expremtx) <- substr(colnames(expremtx),1,9)

all(rownames(pdata)==colnames(expremtx))
## [1] FALSE
expremtx <- expremtx[,order(colnames(expremtx))]
pdata <- pdata[order(rownames(pdata)),]

all(rownames(pdata)==colnames(expremtx))
## [1] TRUE

g. Create a minimal ExpressionSet using expression (assayData) and sample annotation (phenoData) matrix. Print the resulting ExpressionSet.

library(Biobase)

phenoChar <- new("AnnotatedDataFrame", data = pdata)

minphenodata<-ExpressionSet(assayData=as.matrix(expremtx),phenoData = phenoChar)

minphenodata
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 62069 features, 63 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: ERR431566 ERR431567 ... ERR431628 (63 total)
##   varLabels: Source.Name Comment.ENA_SAMPLE. ...
##     Factor.Value.cell.type. (35 total)
##   varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation:

2. Exploratory data analysis

a. \(log_{2}\) transform the expression matrix +1. Why add 1 when log-transforming?

 Ltransex <- log2(expremtx + 1)

One is added to avoid having negative infinity.

b. Do boxplot on the expression matrix. Write observation about the data.

boxplot(Ltransex)

c. How many rows with all zeros are there? Exclude them from the expression matrix.

library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
length(which(rowSums(Ltransex)==0))
## [1] 12087

Creating the expressionset with rows that have atleast one non-zero entry.

nonzero <- Ltransex[rowSums(Ltransex) > 0, ]

d. Get the list of housekeeping genes from http://www.tau.ac.il/~elieis/HKG/

e. Separate the expression matrix into two matrices, one containing expression of housekeeping genes and another containing all other genes. What is the mean/median standard deviation across samples of housekeeping genes? Of other genes? If you are to compare two distributions of standard deviations - which test would you use? Applied to the current data, are the standard deviations of housekeeping genes different from the rest of the genes?

nonzero<-as.data.frame(nonzero)
nonzero$Geneid <- rownames(nonzero)
datwithid<-merge(nonzero, annotII[,c(1,7)], by="Geneid")

exphousek<-datwithid[which(datwithid$hgnc_symbol %in% hkgene$V1),]

exphousek<-subset(exphousek, select=-c(hgnc_symbol,Geneid))
expnonhous<-datwithid[-which(datwithid$hgnc_symbol %in% hkgene$V1),]

expnonhous<-subset(expnonhous, select=-c(hgnc_symbol,Geneid))
nonzero<-subset(nonzero, select=-Geneid)

 #finding mean, median of standard deviation
house<-sapply(exphousek, function(x) sd=sd(x))
nonhouse<-sapply(expnonhous, function(x) sd=sd(x))

mean(house)
## [1] 1.922431
mean(nonhouse)
## [1] 2.941408
median(house)
## [1] 1.89974
median(house)
## [1] 1.89974
ks.test(house, nonhouse)
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  house and nonhouse
## D = 1, p-value < 2.2e-16
## alternative hypothesis: two-sided

f. Summarize median gene expression per cell type. Keep rows annotates with gene symbols. Display the summary expression matrix (gene symbols as rows, cell types as columns, each cell is median expression) as DT::datatable. Optional: Highlight top 100 highest expressed genes for each cell type in the table.1.

library(DT)
#Number of empty space with just the quotation mark.
length(unique(annotII$hgnc_symbol))
## [1] 34768
# Number of NAs
length(which(is.na(annotII$hgnc_symbol))) 
## [1] 9289
#Number of unique gene symbol.
length(which(annotII$hgnc_symbol == ""))
## [1] 16307
symname<-unique(na.omit(annotII$hgnc_symbol[annotII$hgnc_symbol != ""])) # Exclude Symbols with NAs and "" (duplicates are excluded with unique)
genidname<-annotII$Geneid[which(annotII$hgnc_symbol %in% symname)]

lgexpmat<-subset(nonzero, (rownames(nonzero) %in% genidname))

symnamecln<-annotII$hgnc_symbol[which(annotII$Geneid %in% rownames(lgexpmat))]

samplebycelltypes<-split(rownames(pData(minphenodata)),pData(minphenodata)$Characteristics.cell.type)
summary.express.mat<-as.data.frame(lapply(samplebycelltypes,function(x) { apply(lgexpmat[,x],1,median) }) )

datatable(summary.express.mat, rownames=symnamecln,colnames=colnames(summary.express.mat))
## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## http://rstudio.github.io/DT/server.html